Introduction

In this report, we extract information about published JOSS papers and generate graphics as well as a summary table that can be downloaded and used for further analyses.

Load required R packages

suppressPackageStartupMessages({
  library(tibble)
  library(rcrossref)
  library(dplyr)
  library(tidyr)
  library(ggplot2)
  library(lubridate)
  library(gh)
  library(purrr)
  library(jsonlite)
  library(DT)
  library(plotly)
  library(citecorp)
  library(readr)
})
## Named vector recording, for each column, the source it was obtained from
source_track <- c()

## Optionally use today's date as caption in the (non-interactive) plots
add_date_caption <- TRUE
dcap <- if (add_date_caption) lubridate::today() else ""
## Load the archived version of the summary data frame. It is used to fill in 
## information about software repositories (the number of API requests is 
## limited). Rows without a date for when the repo info was obtained are 
## placed first, followed by the remaining rows from oldest to newest.
papers_archive <- dplyr::arrange(
  readRDS(gzcon(url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_analytics.rds?raw=true"))),
  !is.na(repo_info_obtained), repo_info_obtained
)

## Likewise load the archived citation table (all columns read as character), 
## so that the same information does not have to be pulled down repeatedly
citations_archive <- readr::read_delim(
  file = url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_citations.tsv?raw=true"),
  delim = "\t",
  col_names = TRUE,
  col_types = cols(.default = "c")
)

Collect information about papers

Pull down papers and citation info from Crossref

We get the information about published JOSS papers from Crossref, using the rcrossref R package. This package is also used to extract citation counts.

## Fetch JOSS papers from Crossref
## Only 1000 papers at a time can be pulled down, so further pages are 
## requested for as long as all pages retrieved so far were completely filled
lim <- 1000
papers <- rcrossref::cr_works(filter = c(issn = "2475-9066"), limit = lim)$data
i <- 1
repeat {
  if (nrow(papers) != i * lim) {
    break
  }
  papers <- dplyr::bind_rows(
    papers,
    rcrossref::cr_works(filter = c(issn = "2475-9066"),
                        offset = i * lim, limit = lim)$data
  )
  i <- i + 1
}
## Retain only the records of type journal article
papers <- dplyr::filter(papers, type == "journal-article")

## A few papers don't have DOIs - generate them from the URL
## The prefix is matched literally (fixed = TRUE), so that the dots in it are 
## not interpreted as regular expression wildcards
noaltid <- which(is.na(papers$alternative.id))
papers$alternative.id[noaltid] <- gsub("http://dx.doi.org/", "",
                                       papers$url[noaltid], fixed = TRUE)

## Get citation info from Crossref and merge with paper details
## Each DOI is queried only once and the citation table is reduced to a 
## single row per DOI. This way, a DOI that occurs in several rows of 
## 'papers' is matched to exactly one citation count instead of being 
## multiplied by a many-to-many join.
cit <- rcrossref::cr_citation_count(doi = unique(papers$alternative.id)) %>%
  dplyr::distinct(doi, .keep_all = TRUE)
papers <- papers %>% dplyr::left_join(
  cit %>% dplyr::rename(citation_count = count), 
  by = c("alternative.id" = "doi"),
  ## Fail loudly if the citation table ever contains a DOI more than once
  relationship = "many-to-one"
)
## Warning in dplyr::left_join(., cit %>% dplyr::rename(citation_count = count), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1991 of `x` matches multiple rows in `y`.
## ℹ Row 1994 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
## Remove one duplicated paper (all rows with this DOI are dropped)
papers <- dplyr::filter(papers, alternative.id != "10.21105/joss.00688")

## Every column present at this point is attributed to Crossref
source_track <- c(
  source_track,
  setNames(rep("crossref", ncol(papers)), colnames(papers))
)

Pull down info from Whedon API

For each published paper, we use the Whedon API to get information about the pre-review and review issue numbers, the corresponding software repository, etc.

## Collect the entries from all pages of the list of published papers
whedon <- list()
## Number of the page to fetch
p <- 1
## Content of the previously fetched page (none so far)
a0 <- NULL
## Fetch the first page
a <- jsonlite::fromJSON(
  url(paste0("https://joss.theoj.org/papers/published.json?page=", p)),
  simplifyDataFrame = FALSE
)
## Continue until a page is empty or identical to the page before it
while (length(a) > 0 && !identical(a, a0)) {
  whedon <- c(whedon, a)
  p <- p + 1
  a0 <- a
  ## If fetching the next page fails, an empty result is used instead, 
  ## which terminates the loop
  a <- tryCatch({
    jsonlite::fromJSON(
      url(paste0("https://joss.theoj.org/papers/published.json?page=", p)),
      simplifyDataFrame = FALSE
    )}, 
    error = function(e) return(numeric(0))
  )
}

## Turn each API entry into a data frame and stack them
whedon <- dplyr::bind_rows(lapply(whedon, function(w) {
  data.frame(
    api_title = w$title, 
    api_state = w$state,
    editor = paste(w$editor, collapse = ","),
    reviewers = paste(w$reviewers, collapse = ","),
    nbr_reviewers = length(w$reviewers),
    repo_url = w$software_repository,
    ## What remains of the review URL after removing the address of the 
    ## joss-reviews issue tracker
    review_issue_id = sub("https://github.com/openjournals/joss-reviews/issues/", "", w$paper_review),
    doi = w$doi,
    ## NA if the entry has no pre-review issue ID
    prereview_issue_id = ifelse(!is.null(w$meta_review_issue_id),
                                w$meta_review_issue_id, NA_integer_),
    languages = gsub(", ", ",", w$languages),
    # languages = paste(w$languages, collapse = ","),
    archive_doi = w$software_archive
  )
}))

## Add the API information to the papers, matching on the DOI
papers <- dplyr::left_join(papers, whedon, by = c("alternative.id" = "doi"))

## Columns not yet tracked are attributed to the Whedon API
source_track <- c(
  source_track, 
  local({
    untracked <- setdiff(colnames(papers), names(source_track))
    setNames(rep("whedon", length(untracked)), untracked)
  })
)

Combine with info from GitHub issues

From each pre-review and review issue, we extract information about review times and assigned labels.

## Pull down info on all issues in the joss-reviews repository
## (.limit = Inf requests every record; with a fixed cap, issues are silently 
## left out once the repository holds more issues than the cap)
issues <- gh("/repos/openjournals/joss-reviews/issues", 
             .limit = Inf, state = "all")
## From each issue, extract required information
iss <- do.call(dplyr::bind_rows, lapply(issues, function(i) {
  data.frame(title = i$title, 
             number = i$number,
             state = i$state,
             opened = i$created_at,
             ## NA if the issue has no closing time
             closed = ifelse(!is.null(i$closed_at),
                             i$closed_at, NA_character_),
             ncomments = i$comments,
             ## Comma-separated names of the labels attached to the issue, 
             ## leaving out the four labels listed below
             labels = paste(setdiff(
               vapply(i$labels, getElement, 
                      name = "name", character(1L)),
               c("review", "pre-review", "query-scope", "paused")),
               collapse = ","))
}))

## Issues whose title marks them neither as REVIEW nor as PRE-REVIEW issues 
## are collected in 'issother' (this category is discarded)
issother <- dplyr::filter(
  iss,
  !(grepl("\\[PRE REVIEW\\]", title) | grepl("\\[REVIEW\\]", title))
)
dim(issother)
## [1] 129   7
head(issother)
##                                                                                                                                           title
## 1                                                        Questions about "statement of need" and the relative contribution of the three authors
## 2                                                                                         how to include error in the gala dynamics calculation
## 3 Thanks @cudmore for taking the time to review this. Your valuable comments and suggestions greatly improved the quality of the documentation.
## 4                                                                                                # Post-Review Checklist for Editor and Authors
## 5                                                                                                                                           org
## 6                                                                                                                                 @csoneson - I
##   number  state               opened               closed ncomments labels
## 1   6360 closed 2024-02-16T09:50:43Z 2024-02-16T09:50:45Z         1       
## 2   6337 closed 2024-02-08T14:36:25Z 2024-02-08T14:36:27Z         1       
## 3   6262 closed 2024-01-23T02:39:54Z 2024-01-23T02:39:56Z         1       
## 4   5948 closed 2023-10-14T16:25:03Z 2023-10-14T16:25:06Z         1       
## 5   5709 closed 2023-07-31T07:23:26Z 2023-07-31T07:23:28Z         1       
## 6   5708 closed 2023-07-29T18:30:07Z 2023-07-29T18:30:09Z         1
## For REVIEW issues, generate the DOI of the paper from the issue number
## Return the string of zeros needed to left-pad 's' to a width of five 
## characters. Values that already have five or more characters need no 
## padding and give an empty string (instead of an error caused by a 
## negative repeat count).
getnbrzeros <- function(s) {
  strrep("0", max(0L, 5L - nchar(s)))
}
## REVIEW issues: build the DOI from the zero-padded issue number, strip the 
## "[REVIEW]: " prefix from the title, and prepend "review_" to the name of 
## every column except the DOI
issrev <- iss %>%
  dplyr::filter(grepl("\\[REVIEW\\]", title)) %>%
  dplyr::mutate(
    alternative.id = paste0("10.21105/joss.", 
                            purrr::map_chr(number, getnbrzeros),
                            number),
    title = gsub("\\[REVIEW\\]: ", "", title)
  ) %>%
  dplyr::rename_with(~ paste0("review_", .x), .cols = -alternative.id)
## Separately for review and pre-review issues: for each month, count the 
## issues closed in that month and how many of these carry the 'rejected' 
## label
review_rejected <- iss %>% 
  dplyr::filter(grepl("\\[REVIEW\\]", title), !is.na(closed)) %>%
  dplyr::mutate(closedmonth = lubridate::floor_date(as.Date(closed), "month")) %>%
  dplyr::group_by(closedmonth) %>%
  dplyr::summarize(nbr_issues_closed = dplyr::n(),
                   nbr_rejections = sum(grepl("rejected", labels))) %>%
  dplyr::mutate(itype = "review")

prereview_rejected <- iss %>% 
  dplyr::filter(grepl("\\[PRE REVIEW\\]", title), !is.na(closed)) %>%
  dplyr::mutate(closedmonth = lubridate::floor_date(as.Date(closed), "month")) %>%
  dplyr::group_by(closedmonth) %>%
  dplyr::summarize(nbr_issues_closed = dplyr::n(),
                   nbr_rejections = sum(grepl("rejected", labels))) %>%
  dplyr::mutate(itype = "pre-review")

## Stack the two summaries; 'itype' tells them apart
all_rejected <- dplyr::bind_rows(review_rejected, prereview_rejected)
## PRE-REVIEW issues that are labelled neither as withdrawn nor as rejected
isspre <- dplyr::filter(
  iss,
  grepl("\\[PRE REVIEW\\]", title),
  !grepl("withdrawn", labels),
  !grepl("rejected", labels)
)
## Where several pre-review issues share a title, keep only the one with the 
## highest issue number (the latest). Then strip the "[PRE REVIEW]: " prefix 
## from the title and prepend "prerev_" to all column names.
isspre <- isspre %>% 
  dplyr::arrange(desc(number)) %>%
  dplyr::filter(!duplicated(title)) %>%
  dplyr::mutate(title = gsub("\\[PRE REVIEW\\]: ", "", title)) %>%
  dplyr::rename_with(~ paste0("prerev_", .x))

## Attach review issues (via the DOI) and pre-review issues (via the issue 
## number) to the papers, convert the opening/closing times to dates and 
## derive the time spent in each stage
papers <- papers %>% 
  dplyr::left_join(issrev, by = "alternative.id") %>%
  dplyr::left_join(isspre, by = c("prereview_issue_id" = "prerev_number")) %>%
  dplyr::mutate(
    prerev_opened = as.Date(prerev_opened),
    prerev_closed = as.Date(prerev_closed),
    review_opened = as.Date(review_opened),
    review_closed = as.Date(review_closed),
    days_in_pre = prerev_closed - prerev_opened,
    days_in_rev = review_closed - review_opened,
    to_review = !is.na(review_opened)
  )

## Columns not yet tracked are attributed to the JOSS GitHub issues
source_track <- c(
  source_track, 
  local({
    untracked <- setdiff(colnames(papers), names(source_track))
    setNames(rep("joss-github", length(untracked)), untracked)
  })
)

Add information from software repositories

## Reorder so that software repositories that were interrogated longest 
## ago are checked first
## Papers are ordered by their position in the archived data frame; papers 
## that are not found in the archive (NA match) are placed first
tmporder <- order(match(papers$alternative.id, papers_archive$alternative.id),
                  na.last = FALSE)
software_urls <- papers$repo_url[tmporder]
## A repository is treated as hosted on GitHub if its URL contains "github"
is_github <- grepl("github", software_urls)
length(is_github)
## [1] 2402
sum(is_github)
## [1] 2264
software_urls[!is_github]
##   [1] "https://gitlab.com/jesseds/apav"                                                 
##   [2] "https://plmlab.math.cnrs.fr/lmrs/statistique/smmR"                               
##   [3] "https://gitlab.com/mauricemolli/petitRADTRANS"                                   
##   [4] "https://gitlab.inria.fr/bramas/tbfmm"                                            
##   [5] "https://gitlab.com/dmt-development/dmt-core"                                     
##   [6] "https://gitlab.com/fduchate/predihood"                                           
##   [7] "https://gitlab.com/myqueue/myqueue"                                              
##   [8] "https://git.ligo.org/asimov/asimov"                                              
##   [9] "https://jugit.fz-juelich.de/compflu/swalbe.jl/"                                  
##  [10] "https://gitlab.kuleuven.be/ITSCreaLab/public-toolboxes/dyntapy"                  
##  [11] "https://gitlab.com/utopia-project/utopia"                                        
##  [12] "https://gitlab.dune-project.org/dorie/dorie"                                     
##  [13] "https://gitlab.pasteur.fr/vlegrand/ROCK"                                         
##  [14] "https://gitlab.com/moerman1/fhi-cc4s"                                            
##  [15] "https://gitlab.com/ENKI-portal/ThermoCodegen"                                    
##  [16] "https://gitlab.com/wpettersson/kep_solver"                                       
##  [17] "https://bitbucket.org/orionmhdteam/orion2_release1/src/master/"                  
##  [18] "https://gitlab.com/pyFBS/pyFBS"                                                  
##  [19] "https://gitlab.com/mmartin-lagarde/exonoodle-exoplanets/-/tree/master/"          
##  [20] "https://bitbucket.org/meg/cbcbeat"                                               
##  [21] "https://gitlab.com/jtagusari/hrisk-noisemodelling"                               
##  [22] "https://gitlab.mpikg.mpg.de/curcuraci/bmiptools"                                 
##  [23] "https://bitbucket.org/cardosan/brightway2-temporalis"                            
##  [24] "https://savannah.nongnu.org/projects/complot/"                                   
##  [25] "https://gitlab.com/ProjectRHEA/flowsolverrhea"                                   
##  [26] "https://gitlab.inria.fr/miet/miet"                                               
##  [27] "https://gitlab.com/jason-rumengan/pyarma"                                        
##  [28] "http://mutabit.com/repos.fossil/grafoscopio/"                                    
##  [29] "https://gitlab.com/libreumg/dataquier.git"                                       
##  [30] "https://bitbucket.org/manuela_s/hcp/"                                            
##  [31] "https://bitbucket.org/hammurabicode/hamx"                                        
##  [32] "https://gitlab.com/petsc/petsc"                                                  
##  [33] "https://gitlab.inria.fr/bcoye/game-engine-scheduling-simulation"                 
##  [34] "https://gite.lirmm.fr/doccy/RedOak"                                              
##  [35] "https://gitlab.com/utopia-project/dantro"                                        
##  [36] "https://bitbucket.org/berkeleylab/hardware-control/src/main/"                    
##  [37] "https://gitlab.com/cosmograil/starred"                                           
##  [38] "https://gitlab.com/fibreglass/pivc"                                              
##  [39] "https://gitlab.com/culturalcartography/text2map"                                 
##  [40] "https://codebase.helmholtz.cloud/mussel/netlogo-northsea-species.git"            
##  [41] "https://gitlab.com/ffaucher/hawen"                                               
##  [42] "https://gitlab.com/cerfacs/batman"                                               
##  [43] "https://gitlab.com/akantu/akantu"                                                
##  [44] "https://gitlab.com/gdetor/genetic_alg"                                           
##  [45] "https://gitlab.com/manchester_qbi/manchester_qbi_public/madym_cxx/"              
##  [46] "https://gitlab.com/emd-dev/emd"                                                  
##  [47] "https://gricad-gitlab.univ-grenoble-alpes.fr/ttk/spam/"                          
##  [48] "https://gitlab.com/bonsamurais/bonsai/util/ipcc"                                 
##  [49] "https://gitlab.gwdg.de/mpievolbio-it/crbhits"                                    
##  [50] "https://bitbucket.org/rram/dvrlib/src/joss/"                                     
##  [51] "https://gitlab.ethz.ch/holukas/dyco-dynamic-lag-compensation"                    
##  [52] "https://gitlab.com/dlr-dw/ontocode"                                              
##  [53] "https://gitlab.com/project-dare/dare-platform"                                   
##  [54] "https://earth.bsc.es/gitlab/wuruchi/autosubmitreact"                             
##  [55] "https://gitlab.com/vibes-developers/vibes"                                       
##  [56] "https://gitlab.com/picos-api/picos"                                              
##  [57] "https://bitbucket.org/clhaley/Multitaper.jl"                                     
##  [58] "https://gitlab.com/sails-dev/sails"                                              
##  [59] "https://gitlab.com/marinvaders/marinvaders"                                      
##  [60] "https://git.rwth-aachen.de/ants/sensorlab/imea"                                  
##  [61] "https://gitlab.com/InspectorCell/inspectorcell"                                  
##  [62] "https://bitbucket.org/bmskinner/nuclear_morphology"                              
##  [63] "https://bitbucket.org/sbarbot/motorcycle/src/master/"                            
##  [64] "https://gitlab.com/binary_c/binary_c-python/"                                    
##  [65] "https://gitlab.inria.fr/melissa/melissa"                                         
##  [66] "https://gitlab.com/sissopp_developers/sissopp"                                   
##  [67] "https://framagit.org/GustaveCoste/off-product-environmental-impact/"             
##  [68] "https://gitlab.com/tum-ciip/elsa"                                                
##  [69] "https://gitlab.com/cosapp/cosapp"                                                
##  [70] "https://gitlab.com/dlr-ve/esy/amiris/amiris"                                     
##  [71] "https://gitlab.com/remram44/taguette"                                            
##  [72] "https://gitlab.uliege.be/smart_grids/public/gboml"                               
##  [73] "https://bitbucket.org/mpi4py/mpi4py-fft"                                         
##  [74] "https://gitlab.kitware.com/LBM/lattice-boltzmann-solver"                         
##  [75] "https://gitlab.com/eidheim/Simple-Web-Server"                                    
##  [76] "https://gitlab.com/cracklet/cracklet.git"                                        
##  [77] "https://gitlab.com/toposens/public/ros-packages"                                 
##  [78] "https://bitbucket.org/cdegroot/wediff"                                           
##  [79] "https://bitbucket.org/basicsums/basicsums"                                       
##  [80] "https://gitlab.inria.fr/azais/treex"                                             
##  [81] "https://bitbucket.org/glotzer/rowan"                                             
##  [82] "https://gitlab.ifremer.fr/resourcecode/resourcecode"                             
##  [83] "https://gitlab.com/bioeconomy/forobs/biotrade/"                                  
##  [84] "https://gitlab.com/soleil-data-treatment/soleil-software-projects/remote-desktop"
##  [85] "https://gitlab.com/pvst/asi"                                                     
##  [86] "https://git.geomar.de/digital-earth/dasf/dasf-messaging-python"                  
##  [87] "https://gitlab.com/sigcorr/sigcorr"                                              
##  [88] "https://gitlab.com/dsbowen/conditional-inference"                                
##  [89] "https://gitlab.com/thartwig/asloth"                                              
##  [90] "https://code.usgs.gov/umesc/quant-ecology/fishstan/"                             
##  [91] "https://gitlab.com/QComms/cqptoolkit"                                            
##  [92] "https://bitbucket.org/sciencecapsule/sciencecapsule"                             
##  [93] "https://www.idpoisson.fr/fullswof/"                                              
##  [94] "https://framagit.org/GustaveCoste/eldam"                                         
##  [95] "https://gitlab.com/fame-framework/fame-io"                                       
##  [96] "https://gitlab.com/fame-framework/fame-core"                                     
##  [97] "https://bitbucket.org/miketuri/perl-spice-sim-seus/"                             
##  [98] "https://bitbucket.org/ocellarisproject/ocellaris"                                
##  [99] "https://gitlab.inria.fr/mosaic/bvpy"                                             
## [100] "https://gitlab.com/cosmograil/PyCS3"                                             
## [101] "https://bitbucket.org/berkeleylab/esdr-pygdh/"                                   
## [102] "https://gitlab.com/LMSAL_HUB/aia_hub/aiapy"                                      
## [103] "https://sourceforge.net/p/mcapl/mcapl_code/ci/master/tree/"                      
## [104] "https://gitlab.com/davidtourigny/dynamic-fba"                                    
## [105] "https://gitlab.com/moorepants/skijumpdesign"                                     
## [106] "https://git.iws.uni-stuttgart.de/tools/frackit"                                  
## [107] "https://gitlab.com/chaver/choco-mining"                                          
## [108] "https://gitlab.com/habermann_lab/phasik"                                         
## [109] "https://gitlab.com/drti/basic-tools"                                             
## [110] "https://gitlab.com/ags-data-format-wg/ags-python-library"                        
## [111] "https://gitlab.com/dlr-ve/autumn/"                                               
## [112] "https://bitbucket.org/likask/mofem-cephas"                                       
## [113] "https://bitbucket.org/dolfin-adjoint/pyadjoint"                                  
## [114] "https://gitlab.com/materials-modeling/wulffpack"                                 
## [115] "https://gitlab.com/ampere2/metalwalls"                                           
## [116] "https://gitlab.com/energyincities/besos/"                                        
## [117] "https://gitlab.com/tesch1/cppduals"                                              
## [118] "https://gitlab.com/geekysquirrel/bigx"                                           
## [119] "https://bitbucket.org/cloopsy/android/"                                          
## [120] "https://gitlab.com/celliern/scikit-fdiff/"                                       
## [121] "https://bitbucket.org/dghoshal/frieda"                                           
## [122] "https://gitlab.com/gims-developers/gims"                                         
## [123] "https://gitlab.com/tue-umphy/software/parmesan"                                  
## [124] "https://gitlab.com/costrouc/pysrim"                                              
## [125] "https://doi.org/10.17605/OSF.IO/3DS6A"                                           
## [126] "https://gitlab.com/programgreg/tagginglatencyestimator"                          
## [127] "https://git.mpib-berlin.mpg.de/castellum/castellum"                              
## [128] "https://gitlab.com/pythia-uq/pythia"                                             
## [129] "https://gitlab.com/dglaeser/fieldcompare"                                        
## [130] "https://gitlab.com/dlr-ve/esy/sfctools/framework/"                               
## [131] "https://gitlab.com/robizzard/libcdict"                                           
## [132] "https://gitlab.awi.de/sicopolis/sicopolis"                                       
## [133] "https://gitlab.com/permafrostnet/teaspoon"                                       
## [134] "https://bitbucket.org/mituq/muq2.git"                                            
## [135] "https://gitlab.com/materials-modeling/calorine"                                  
## [136] "https://gitlab.ruhr-uni-bochum.de/reichp2y/proppy"                               
## [137] "https://c4science.ch/source/tamaas/"                                             
## [138] "https://gitlab.com/datafold-dev/datafold/"
## Query the GitHub API for every distinct GitHub-hosted repository. Each URL
## is queried only once: repeated queries of the same URL spend additional 
## API requests and can return different values, which would give several 
## rows for the same repo_url and make the uniqueness check on the result fail.
df <- do.call(dplyr::bind_rows, lapply(unique(software_urls[is_github]), function(u) {
  ## Normalize the URL: drop a trailing slash, then a trailing '.git', and 
  ## replace a leading http:// by https://. Anything from '/tree/' or 
  ## '/blob/' onwards is cut off.
  u0 <- gsub("^http://", "https://", gsub("\\.git$", "", gsub("/$", "", u)))
  if (grepl("/tree/", u0)) {
    u0 <- strsplit(u0, "/tree/")[[1]][1]
  }
  if (grepl("/blob/", u0)) {
    u0 <- strsplit(u0, "/blob/")[[1]][1]
  }
  ## API endpoint of the repository (computed once, reused for all requests)
  repo_endpoint <- gsub("(https://)?(www.)?github.com/", "/repos/", u0)
  
  info <- try({
    gh(repo_endpoint)
  })
  ## No row is generated without valid basic repository information, so 
  ## don't spend any further API requests on this repository
  if (is(info, "try-error") || length(info) <= 1) {
    return(NULL)
  }
  
  languages <- try({
    gh(paste0(repo_endpoint, "/languages"), 
       .limit = 500)
  })
  topics <- try({
    gh(paste0(repo_endpoint, "/topics"), 
       .accept = "application/vnd.github.mercy-preview+json", .limit = 500)
  })
  contribs <- try({
    gh(paste0(repo_endpoint, "/contributors"), 
       .limit = 500)
  })
  
  ## Number of contributors, and number of contributors with at least two 
  ## contributions (NA if the request failed or returned nothing)
  if (!is(contribs, "try-error")) {
    if (length(contribs) == 0) {
      repo_nbr_contribs <- repo_nbr_contribs_2ormore <- NA_integer_
    } else {
      repo_nbr_contribs <- length(contribs)
      repo_nbr_contribs_2ormore <- sum(vapply(contribs, function(x) x$contributions >= 2, NA_integer_))
      if (is.na(repo_nbr_contribs_2ormore)) {
        print(contribs)
      }
    }
  } else {
    repo_nbr_contribs <- repo_nbr_contribs_2ormore <- NA_integer_
  }
  
  ## Languages as comma-separated 'name:value' pairs (empty string if the 
  ## request failed or returned nothing)
  if (!is(languages, "try-error")) {
    if (length(languages) == 0) {
      repolang <- ""
    } else {
      repolang <- paste(paste(names(unlist(languages)), 
                              unlist(languages), sep = ":"), collapse = ",")
    }
  } else {
    repolang <- ""
  }
  
  ## Comma-separated topics (empty string if the request failed or the 
  ## repository has no topics)
  if (!is(topics, "try-error")) {
    if (length(topics$names) == 0) {
      repotopics <- ""
    } else {
      repotopics <- paste(unlist(topics$names), collapse = ",")
    }
  } else {
    repotopics <- ""
  }
  
  ## The original (non-normalized) URL is stored, since it is the key used 
  ## for merging with the papers
  data.frame(repo_url = u, 
             repo_created = info$created_at,
             repo_updated = info$updated_at,
             repo_pushed = info$pushed_at,
             repo_nbr_stars = info$stargazers_count,
             repo_language = ifelse(!is.null(info$language),
                                    info$language, NA_character_),
             repo_languages_bytes = repolang,
             repo_topics = repotopics,
             repo_license = ifelse(!is.null(info$license),
                                   info$license$key, NA_character_),
             repo_nbr_contribs = repo_nbr_contribs,
             repo_nbr_contribs_2ormore = repo_nbr_contribs_2ormore
  )
})) %>%
  dplyr::mutate(repo_created = as.Date(repo_created),
                repo_updated = as.Date(repo_updated),
                repo_pushed = as.Date(repo_pushed)) %>%
  dplyr::distinct() %>%
  ## Record the date on which the repository information was obtained
  dplyr::mutate(repo_info_obtained = lubridate::today())
## Every repository URL must occur only once. If that is not the case, print 
## the number of unique URLs, the total number of URLs and the duplicated 
## URLs before stopping.
if (anyDuplicated(df$repo_url) > 0) {
  print(length(unique(df$repo_url)))
  print(length(df$repo_url))
  print(df$repo_url[duplicated(df$repo_url)])
}
stopifnot(length(unique(df$repo_url)) == length(df$repo_url))
dim(df)

## Repositories for which the GitHub API query gave no valid response are 
## missing from df. For these, fall back on the archived data frame, 
## restricted to the columns it shares with df.
dfarchive <- papers_archive %>%
  dplyr::select(intersect(colnames(df), colnames(papers_archive))) %>%
  dplyr::filter(!(repo_url %in% df$repo_url))
df <- dplyr::bind_rows(df, dfarchive)

## Add the repository information to the papers
papers <- dplyr::left_join(papers, df, by = "repo_url")

## Columns not yet tracked are attributed to the software repositories
source_track <- c(
  source_track, 
  local({
    untracked <- setdiff(colnames(papers), names(source_track))
    setNames(rep("sw-github", length(untracked)), untracked)
  })
)

Clean up a bit

## Drop three columns that are not needed, then add
## - the publication date in Date format
## - the half year (H1 = January-June, H2 = July-December) of publication, 
##   as a factor with chronologically ordered levels
## - the number of authors
## and finally remove duplicated rows
papers <- papers %>%
  dplyr::select(-c(reference, license, link)) %>%
  dplyr::mutate(
    published.date = as.Date(published.print),
    halfyear = factor(
      paste0(year(published.date), 
             ifelse(month(published.date) <= 6, "H1", "H2")),
      levels = paste0(rep(sort(unique(year(published.date))), each = 2), 
                      c("H1", "H2"))
    ),
    nbr_authors = vapply(author, nrow, NA_integer_)
  ) %>%
  dplyr::distinct()

## Columns not yet tracked are attributed to this cleanup step
source_track <- c(
  source_track, 
  local({
    untracked <- setdiff(colnames(papers), names(source_track))
    setNames(rep("cleanup", length(untracked)), untracked)
  })
)

Tabulate number of missing values

In some cases, fetching information from (e.g.) the GitHub API fails for a subset of the publications. There are also other reasons for missing values (for example, the earliest submissions do not have an associated pre-review issue). The table below lists the number of missing values for each of the variables in the data frame.

## Interactive table with one row per column of 'papers', giving the number 
## of missing values and the source of the column
DT::datatable(
  dplyr::mutate(
    data.frame(variable = colnames(papers),
               nbr_missing = colSums(is.na(papers))),
    source = source_track[variable]
  ),
  rownames = FALSE, 
  escape = FALSE, 
  options = list(scrollX = TRUE),
  filter = list(position = 'top', clear = FALSE)
)

Number of published papers per month and year

## Bar chart: number of papers published in each month
ggplot(
  papers %>% 
    dplyr::mutate(pubmonth = lubridate::floor_date(published.date, "month")) %>%
    dplyr::group_by(pubmonth) %>%
    dplyr::summarize(npub = n()), 
  aes(x = factor(pubmonth), y = npub)
) + 
  geom_bar(stat = "identity") + 
  theme_minimal() + 
  theme(axis.title = element_text(size = 15),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 
  labs(x = "", y = "Number of published papers per month", caption = dcap)

## Bar chart: number of papers published in each year
ggplot(
  papers %>% 
    dplyr::mutate(pubyear = lubridate::year(published.date)) %>%
    dplyr::group_by(pubyear) %>%
    dplyr::summarize(npub = n()), 
  aes(x = factor(pubyear), y = npub)
) + 
  geom_bar(stat = "identity") + 
  theme_minimal() + 
  theme(axis.title = element_text(size = 15),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 
  labs(x = "", y = "Number of published papers per year", caption = dcap)

The plots below illustrate the fraction of pre-review and review issues closed during each month that have the ‘rejected’ label attached.

## Fraction of the issues closed in each month that carry the 'rejected' 
## label, with one panel per issue type
ggplot(
  all_rejected, 
  aes(x = factor(closedmonth), y = nbr_rejections/nbr_issues_closed)
) + 
  geom_bar(stat = "identity") + 
  facet_wrap(~ itype, ncol = 1) + 
  theme_minimal() + 
  theme(axis.title = element_text(size = 15),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 
  labs(x = "Month of issue closing", y = "Fraction of issues rejected",
       caption = dcap)

Citation distribution

Papers with 20 or more citations are grouped in the “>=20” category.

## Number of publications for each citation count; counts of 20 or more are 
## collapsed into the ">=20" category
ggplot(
  papers %>% 
    dplyr::mutate(
      citation_count = factor(
        replace(citation_count, citation_count >= 20, ">=20"),
        levels = c(0:20, ">=20")
      )
    ) %>%
    dplyr::group_by(citation_count) %>%
    dplyr::tally(),
  aes(x = citation_count, y = n)
) + 
  geom_bar(stat = "identity") + 
  theme_minimal() + 
  labs(x = "Crossref citation count", y = "Number of publications", caption = dcap)

Most cited papers

The table below sorts the JOSS papers in decreasing order by the number of citations in Crossref.

## Interactive table of the papers, sorted by decreasing citation count, 
## with the URL shown as a link that opens in a new tab
DT::datatable(
  papers %>% 
    dplyr::arrange(desc(citation_count)) %>% 
    dplyr::select(title, url, published.date, citation_count) %>%
    dplyr::mutate(url = paste0("<a href='", url, "' target='_blank'>", 
                               url,"</a>")),
  escape = FALSE,
  options = list(scrollX = TRUE),
  filter = list(position = 'top', clear = FALSE)
)

Citation count vs time since publication

## Interactive scatter plot of citation count against publication date, with
## a smoothed trend. The paper title is mapped to `label` only so that it can
## be shown in the plotly tooltip.
plotly::ggplotly(
  ggplot(papers, aes(x = published.date, y = citation_count, label = title)) + 
    geom_point(alpha = 0.5) + theme_bw() + scale_y_sqrt() + 
    ## The smoother must not inherit `label`: the aesthetic differs for every
    ## point, so the stat would drop it and emit an "aesthetics were dropped
    ## during statistical transformation" warning. Setting it to NULL removes
    ## it for this layer only; the points keep it for the tooltip.
    geom_smooth(aes(label = NULL)) + 
    labs(x = "Date of publication", y = "Crossref citation count", caption = dcap) + 
    theme(axis.title = element_text(size = 15)),
  tooltip = c("label", "x", "y")
)
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Power law of citation count within each half year

Here, we plot the citation count for all papers published within each half year, sorted in decreasing order.

## Citation counts ranked in decreasing order within each half year;
## idx is the rank of a paper among those published in the same half year
papers %>%
  dplyr::arrange(desc(citation_count)) %>%
  dplyr::group_by(halfyear) %>%
  dplyr::mutate(idx = dplyr::row_number()) %>%
  ggplot(aes(x = idx, y = citation_count)) +
  geom_point(alpha = 0.5) +
  facet_wrap(vars(halfyear), scales = "free") +
  theme_bw() +
  labs(x = "Index", y = "Crossref citation count", caption = dcap)

Pre-review/review time over time

In these plots we investigate whether the time a submission spends in the pre-review or review stage (or their sum) has changed over time. The blue curve is a rolling median, calculated for each opening date from all submissions opened within a 120-day window centered on that date.

## Helper functions (modified from https://stackoverflow.com/questions/65147186/geom-smooth-with-median-instead-of-mean)

## Fit a rolling-median "model" that can be used as a smoothing method.
##
## Arguments:
##   formula  Accepted for compatibility with the smoothing interface; the
##            function always reads the columns `x` and `y` of `data`.
##   data     Data frame (or list) with elements `x` and `y`.
##   xwindow  Total width of the window, in the units of `x`. The median for
##            a given x-value uses all points strictly within xwindow/2 of it.
##   ...      Ignored.
##
## Returns an object of class "rollmed": a list with the sorted x-values
## (`x`), the rolling medians (`y`) and an interpolating function (`f`).
rolling_median <- function(formula, data, xwindow = 120, ...) {
  ## Get order of x-values and sort x/y
  ordr <- order(data$x)
  x <- data$x[ordr]
  y <- data$y[ordr]
  half_width <- xwindow / 2

  ## Initialize vector for smoothed y-values
  ys <- rep(NA, length(x))
  ## Calculate median y-value for each unique x-value
  for (xs in setdiff(unique(x), NA)) {
    ## Get x-values in the (open) window, and calculate median of
    ## the corresponding y
    in_window <- ((xs - half_width) < x) & (x < (xs + half_width))
    ys[x == xs] <- median(y[in_window], na.rm = TRUE)
  }

  ## Tied x-values all carry the same median, so collapsing them is harmless.
  ## Passing `ties` explicitly (same function as the default) avoids the
  ## "collapsing to unique 'x' values" warning that approxfun() emits when
  ## ties are present and the argument is left unspecified.
  structure(list(x = x, y = ys, f = approxfun(x, ys, ties = mean)),
            class = "rollmed")
}

## Predict method for "rollmed" objects: evaluates the stored interpolation
## function at newdata$x and names each prediction by its x-value.
predict.rollmed <- function(mod, newdata, ...) {
  query_x <- newdata$x
  fitted <- mod$f(query_x)
  names(fitted) <- query_x
  fitted
}
## Days in pre-review against the date the pre-review was opened.
## All three plots overlay a rolling median with a 120-day window.
papers %>%
  ggplot(aes(x = prerev_opened, y = as.numeric(days_in_pre))) +
  geom_point() +
  geom_smooth(method = "rolling_median", method.args = list(xwindow = 120),
              formula = y ~ x, se = FALSE) +
  theme_bw() +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Date of pre-review opening", y = "Number of days in pre-review",
       caption = dcap)

## Days in review against the date the review was opened
papers %>%
  ggplot(aes(x = review_opened, y = as.numeric(days_in_rev))) +
  geom_point() +
  geom_smooth(method = "rolling_median", method.args = list(xwindow = 120),
              formula = y ~ x, se = FALSE) +
  theme_bw() +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Date of review opening", y = "Number of days in review",
       caption = dcap)

## Total days in pre-review plus review, against the pre-review opening date
papers %>%
  ggplot(aes(x = prerev_opened,
             y = as.numeric(days_in_pre) + as.numeric(days_in_rev))) +
  geom_point() +
  geom_smooth(method = "rolling_median", method.args = list(xwindow = 120),
              formula = y ~ x, se = FALSE) +
  theme_bw() +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Date of pre-review opening",
       y = "Number of days in pre-review + review",
       caption = dcap)

Languages

Next, we consider the languages used by the submissions, both as reported by Whedon and based on the information encoded in available GitHub repositories (for the latter, we also record the number of bytes of code written in each language). Note that a given submission can use multiple languages.

## Language information from Whedon: split the comma-separated language
## string of each paper, then count, for every distinct language, the number
## of papers that list it
sspl <- strsplit(papers$languages, ",")
all_languages <- unique(unlist(sspl))
langs <- dplyr::bind_rows(lapply(all_languages, function(lang) {
  listed <- vapply(sspl, function(paper_langs) lang %in% paper_langs,
                   numeric(1))
  data.frame(language = lang, nbr_submissions_Whedon = sum(listed))
}))

## Language information from GitHub software repos.
## repo_languages_bytes holds, per paper, a comma-separated list of
## "language:bytes" entries. All entries are flattened into one vector and
## parsed directly. Papers without repo information (NA) contribute nothing.
## Parsing entry by entry avoids building an intermediate data frame with one
## column per entry, and does not fail on an entry with an unexpected number
## of ":"-separated fields (missing fields become NA).
repo_lang_entries <- unlist(strsplit(papers$repo_languages_bytes, ","))
repo_lang_entries <- repo_lang_entries[!is.na(repo_lang_entries)]
repo_lang_fields <- strsplit(repo_lang_entries, ":")
langbytes <- data.frame(
  language = vapply(repo_lang_fields, function(p) p[1], character(1)),
  bytes = as.numeric(vapply(repo_lang_fields, function(p) p[2],
                            character(1))),
  stringsAsFactors = FALSE
) %>%
  dplyr::filter(!is.na(language)) %>%
  dplyr::group_by(language) %>%
  ## Total bytes of code and number of repos (entries) per language
  dplyr::summarize(nbr_bytes_GitHub = sum(bytes),
                   nbr_repos_GitHub = dplyr::n()) %>%
  dplyr::arrange(desc(nbr_bytes_GitHub))

## Combine with the Whedon counts; languages seen in only one source are kept
langs <- dplyr::full_join(langs, langbytes, by = "language")
## Languages listed by more than 10 submissions according to Whedon, ordered
## by decreasing number of submissions
langs %>%
  dplyr::arrange(desc(nbr_submissions_Whedon)) %>%
  dplyr::filter(nbr_submissions_Whedon > 10) %>%
  dplyr::mutate(language = factor(language, levels = language)) %>%
  ggplot(aes(x = language, y = nbr_submissions_Whedon)) +
  geom_col() +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    axis.title = element_text(size = 15)
  ) +
  labs(x = "", y = "Number of submissions", caption = dcap)

## Full language table, sorted by total bytes of code on GitHub
langs %>%
  dplyr::arrange(desc(nbr_bytes_GitHub)) %>%
  DT::datatable(
    escape = FALSE,
    filter = list(position = "top", clear = FALSE),
    options = list(scrollX = TRUE)
  )

## Total bytes of code vs number of repos per language (log-log scale)
langs %>%
  ggplot(aes(x = nbr_repos_GitHub, y = nbr_bytes_GitHub)) +
  geom_point() +
  geom_smooth() +
  scale_x_log10() +
  scale_y_log10() +
  theme_bw() +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Number of repos using the language",
       y = "Total number of bytes of code\nwritten in the language",
       caption = dcap)

Association between number of citations and number of stars of the GitHub repo

## Interactive scatter plot of GitHub stars against Crossref citations
## (both axes on a square-root scale); the tooltip shows the paper title
ggplotly(
  p = papers %>%
    ggplot(aes(x = citation_count, y = repo_nbr_stars, label = title)) +
    geom_point(alpha = 0.5) +
    scale_x_sqrt() +
    scale_y_sqrt() +
    theme_bw() +
    theme(axis.title = element_text(size = 15)) +
    labs(x = "Crossref citation count", y = "Number of stars, GitHub repo",
         caption = dcap),
  tooltip = c("label", "x", "y")
)

Distribution of time between GitHub repo creation and JOSS submission

## Histogram of the time from GitHub repo creation to the opening of the JOSS
## pre-review issue. The unit is fixed to days explicitly: as.numeric() on a
## bare difference uses whatever unit R selected for it, which is only
## guaranteed to be days for Date columns (date-time columns may yield
## seconds, minutes or hours), and the axis label promises days.
ggplot(papers, aes(x = as.numeric(difftime(prerev_opened, repo_created,
                                           units = "days")))) +
  geom_histogram(bins = 50) + 
  theme_bw() + 
  labs(x = "Time (days) from repo creation to JOSS pre-review start", 
       caption = dcap) + 
  theme(axis.title = element_text(size = 15))

Distribution of time between JOSS acceptance and last commit

## Histogram of the time from closure of the JOSS review issue to the most
## recent push to the repo, one panel per publication year. The unit is fixed
## to days explicitly: as.numeric() on a bare difference uses whatever unit R
## selected for it, which is only guaranteed to be days for Date columns,
## and the axis label promises days.
ggplot(papers, aes(x = as.numeric(difftime(repo_pushed, review_closed,
                                           units = "days")))) +
  geom_histogram(bins = 50) + 
  theme_bw() + 
  labs(x = "Time (days) from closure of JOSS review to most recent commit in repo",
       caption = dcap) + 
  theme(axis.title = element_text(size = 15)) + 
  facet_wrap(~ year(published.date), scales = "free_y")

Number of authors per paper

List the papers with the largest number of authors, and display the distribution of the number of authors per paper, for papers with at most 20 authors.

## The ten papers with the largest number of authors
papers %>%
  dplyr::arrange(desc(nbr_authors)) %>%
  dplyr::select(title, published.date, url, nbr_authors) %>%
  head(n = 10) %>%
  as.data.frame()
##                                                                                                                          title
## 1                                                                                    SunPy: A Python package for Solar Physics
## 2                                                        ENZO: An Adaptive Mesh Refinement Code for Astrophysics (Version 2.6)
## 3  The Pencil Code, a modular MPI code for partial differential equations and particles: multipurpose and multiuser-maintained
## 4                                                     GRChombo: An adaptable numerical relativity code for fundamental physics
## 5                                                                                       PyBIDS: Python tools for BIDS datasets
## 6                                       DataLad: distributed system for joint management of code, data, and their relationship
## 7                                                                            Chaste: Cancer, Heart and Soft Tissue Environment
## 8                                        NOMAD: A distributed web-based platform for managing\nmaterials science research data
## 9                                                                           spam: Software for Practical Analysis of Materials
## 10                                                      SNEWPY: A Data Pipeline from Supernova Simulations to Neutrino Signals
##    published.date                                   url nbr_authors
## 1      2020-02-14 http://dx.doi.org/10.21105/joss.01832         124
## 2      2019-10-03 http://dx.doi.org/10.21105/joss.01636          55
## 3      2021-02-21 http://dx.doi.org/10.21105/joss.02807          38
## 4      2021-12-10 http://dx.doi.org/10.21105/joss.03703          32
## 5      2019-08-12 http://dx.doi.org/10.21105/joss.01294          31
## 6      2021-07-01 http://dx.doi.org/10.21105/joss.03262          31
## 7      2020-03-13 http://dx.doi.org/10.21105/joss.01848          29
## 8      2023-10-15 http://dx.doi.org/10.21105/joss.05388          29
## 9      2020-07-13 http://dx.doi.org/10.21105/joss.02286          27
## 10     2021-11-27 http://dx.doi.org/10.21105/joss.03772          26
## Use one histogram bin per author count among papers with at most 20
## authors. na.rm = TRUE keeps the bin count defined when nbr_authors is
## missing for some papers: subsetting with an NA condition yields NA
## elements, which would otherwise make max() return NA.
nbins <- max(papers$nbr_authors[papers$nbr_authors <= 20], na.rm = TRUE)
## Distribution of the number of authors, one panel per publication year
ggplot(papers %>% dplyr::filter(nbr_authors <= 20),
  aes(x = nbr_authors)) + 
  geom_histogram(bins = nbins, fill = "lightgrey", color = "grey50") + 
  theme_bw() + 
  facet_wrap(~ year(published.date), scales = "free_y") + 
  theme(axis.title = element_text(size = 15)) + 
  labs(x = "Number of authors",
       y = "Number of publications with\na given number of authors", 
       caption = dcap)

## Stacked area chart of the yearly fraction of papers by number of authors
## (1-5, with larger author lists pooled as ">5"). Year is a factor while
## counting so that, with .drop = FALSE, empty year/author-count combinations
## are kept as zeros; it is converted back to an integer for the x-axis.
papers %>%
  dplyr::mutate(
    nbr_authors = factor(
      replace(nbr_authors, nbr_authors > 5, ">5"),
      levels = c("1", "2", "3", "4", "5", ">5")
    ),
    year = factor(lubridate::year(published.date))
  ) %>%
  dplyr::group_by(year, nbr_authors, .drop = FALSE) %>%
  dplyr::summarize(n = dplyr::n()) %>%
  dplyr::mutate(freq = n / sum(n)) %>%
  dplyr::mutate(year = as.integer(as.character(year))) %>%
  ggplot(aes(x = year, y = freq, fill = nbr_authors)) +
  geom_area() +
  scale_fill_brewer(palette = "Set1", name = "Number of\nauthors",
                    na.value = "grey") +
  theme_minimal() +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Year", y = "Fraction of submissions", caption = dcap)

Number of authors vs number of contributors to the GitHub repo

Note that points are slightly jittered to reduce the overlap.

## Interactive scatter plot of the number of repo contributors (with at least
## 2 commits) against the number of authors. Points are jittered slightly to
## reduce overlap; the solid line marks equality of the two numbers.
plotly::ggplotly(
  p = papers %>%
    ggplot(aes(x = nbr_authors, y = repo_nbr_contribs_2ormore,
               label = title)) +
    geom_abline(intercept = 0, slope = 1) +
    geom_jitter(alpha = 0.5, width = 0.05, height = 0.05) +
    scale_x_sqrt() +
    scale_y_sqrt() +
    theme_bw() +
    theme(axis.title = element_text(size = 15)) +
    labs(x = "Number of authors",
         y = "Number of contributors\nwith at least 2 commits",
         caption = dcap),
  tooltip = c("label", "x", "y")
)

Number of reviewers per paper

Submissions associated with rOpenSci and pyOpenSci are not considered here, since they are not explicitly reviewed at JOSS.

## Number of reviewers per submission, one panel per publication year;
## submissions labelled rOpenSci or pyOpenSci are excluded
papers %>%
  dplyr::filter(!grepl("rOpenSci|pyOpenSci", prerev_labels)) %>%
  dplyr::mutate(year = lubridate::year(published.date)) %>%
  ggplot(aes(x = nbr_reviewers)) +
  geom_bar() +
  facet_wrap(vars(year)) +
  theme_bw() +
  labs(x = "Number of reviewers", y = "Number of submissions",
       caption = dcap)

Most active reviewers

Submissions associated with rOpenSci and pyOpenSci are not considered here, since they are not explicitly reviewed at JOSS.

## One row per (reviewer, paper) pair, with the publication year of the paper;
## submissions labelled rOpenSci or pyOpenSci are excluded
reviewers <- papers %>%
  dplyr::filter(!grepl("rOpenSci|pyOpenSci", prerev_labels)) %>%
  dplyr::transmute(reviewers, year = lubridate::year(published.date)) %>%
  tidyr::separate_rows(reviewers, sep = ",")

## Most active reviewers: number of reviews and the span of publication years
## covered (a single year is shown when first and last year coincide)
reviewers %>%
  dplyr::group_by(reviewers) %>%
  dplyr::summarize(
    nbr_reviews = dplyr::n(),
    timespan = paste(unique(range(year)), collapse = " - ")
  ) %>%
  dplyr::arrange(desc(nbr_reviews)) %>%
  DT::datatable(
    escape = FALSE,
    rownames = FALSE,
    filter = list(position = "top", clear = FALSE),
    options = list(scrollX = TRUE)
  )

Number of papers per editor and year

## Number of papers handled by each editor, one panel per publication year;
## bars are shaded by whether the submission carries an rOpenSci/pyOpenSci
## label
papers %>%
  dplyr::mutate(
    year = lubridate::year(published.date),
    `r/pyOpenSci` = factor(
      grepl("rOpenSci|pyOpenSci", prerev_labels),
      levels = c("TRUE", "FALSE")
    )
  ) %>%
  ggplot(aes(x = editor)) +
  geom_bar(aes(fill = `r/pyOpenSci`)) +
  facet_wrap(vars(year), ncol = 1) +
  scale_fill_manual(values = c(`TRUE` = "grey65", `FALSE` = "grey35")) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  labs(x = "Editor", y = "Number of submissions", caption = dcap)

Distribution of software repo licenses

## Order the license levels so that the major license families (apache, bsd,
## mit, gpl, mpl) come first, followed by all remaining licenses in
## alphabetical order.
all_licenses <- sort(unique(papers$repo_license))
license_levels <- unlist(lapply(
  c("apache", "bsd", "mit", "gpl", "mpl"),
  function(family) grep(family, all_licenses, value = TRUE)
))
## unique() guards against a license name matching more than one family
## pattern: duplicated levels would make factor() fail.
license_levels <- unique(c(license_levels,
                           setdiff(all_licenses, license_levels)))
## Number of submissions per software license, one panel per publication year
ggplot(papers %>% 
         dplyr::mutate(repo_license = factor(repo_license, 
                                             levels = license_levels)),
       aes(x = repo_license)) +
  geom_bar() + 
  theme_bw() + 
  labs(x = "Software license", y = "Number of submissions", caption = dcap) + 
  theme(axis.title = element_text(size = 15),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 
  facet_wrap(~ year(published.date), scales = "free_y")

## For the plots below, licenses used by at most 2.5% of the submissions
## are relabelled as 'other'
tbl <- table(papers$repo_license)
to_replace <- names(tbl[tbl <= 0.025 * nrow(papers)])

## Stacked area chart of the yearly number of submissions per license.
## Year is a factor while counting so that, with .drop = FALSE, empty
## year/license combinations are kept as zeros.
papers %>%
  dplyr::mutate(
    year = factor(lubridate::year(published.date)),
    repo_license = replace(repo_license, repo_license %in% to_replace,
                           "other")
  ) %>%
  dplyr::mutate(
    repo_license = factor(
      repo_license,
      levels = license_levels[license_levels %in% repo_license]
    )
  ) %>%
  dplyr::group_by(year, repo_license, .drop = FALSE) %>%
  dplyr::count() %>%
  dplyr::mutate(year = as.integer(as.character(year))) %>%
  ggplot(aes(x = year, y = n, fill = repo_license)) +
  geom_area() +
  scale_fill_brewer(palette = "Set1", name = "Software\nlicense",
                    na.value = "grey") +
  theme_minimal() +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Year", y = "Number of submissions", caption = dcap)

## Stacked area chart of the yearly fraction of submissions per license,
## with rare licenses (those listed in to_replace) pooled as 'other'.
## Year is a factor while counting so that, with .drop = FALSE, empty
## year/license combinations are kept as zeros.
papers %>%
  dplyr::mutate(
    year = factor(lubridate::year(published.date)),
    repo_license = replace(repo_license, repo_license %in% to_replace,
                           "other")
  ) %>%
  dplyr::mutate(
    repo_license = factor(
      repo_license,
      levels = license_levels[license_levels %in% repo_license]
    )
  ) %>%
  dplyr::group_by(year, repo_license, .drop = FALSE) %>%
  dplyr::summarize(n = dplyr::n()) %>%
  dplyr::mutate(freq = n / sum(n)) %>%
  dplyr::mutate(year = as.integer(as.character(year))) %>%
  ggplot(aes(x = year, y = freq, fill = repo_license)) +
  geom_area() +
  scale_fill_brewer(palette = "Set1", name = "Software\nlicense",
                    na.value = "grey") +
  theme_minimal() +
  theme(axis.title = element_text(size = 15)) +
  labs(x = "Year", y = "Fraction of submissions", caption = dcap)

Most common GitHub repo topics

## Frequency of each GitHub repo topic across all papers (topics are stored
## as a comma-separated string per paper)
a <- papers$repo_topics %>% strsplit(",") %>% unlist()
a <- a[!is.na(a)]
topicfreq <- table(a)

## Word cloud of the topics, sized by the square root of their frequency;
## the seed makes the layout reproducible
colors <- viridis::viridis(100)
set.seed(1234)
wordcloud::wordcloud(
  words = names(topicfreq), freq = sqrt(topicfreq),
  scale = c(10, 0.1), min.freq = 1, max.words = 300,
  random.order = FALSE, random.color = TRUE, rot.per = 0.05,
  colors = colors, ordered.colors = FALSE, use.r.layout = FALSE,
  vfont = c("serif", "plain")
)

## Table of topics, most frequent first
topicfreq %>%
  as.data.frame() %>%
  dplyr::rename(topic = a, nbr_repos = Freq) %>%
  dplyr::arrange(desc(nbr_repos)) %>%
  DT::datatable(
    escape = FALSE,
    rownames = FALSE,
    filter = list(position = "top", clear = FALSE),
    options = list(scrollX = TRUE)
  )

Citation analysis

Here, we take a more detailed look at the papers that cite JOSS papers, using data from the Open Citations Corpus.

Get citing papers for each submission

## Retrieve the citing papers for all JOSS DOIs from Open Citations, drop
## duplicated records and stamp them with today's date. If the request fails,
## `citations` is set to NULL so that the code further down can fall back on
## the archived citation table; the failure is reported rather than silently
## swallowed, so that a report built on archived data only can be recognized.
citations <- tryCatch({
  citecorp::oc_coci_cites(doi = papers$alternative.id) %>%
    dplyr::distinct() %>%
    dplyr::mutate(citation_info_obtained = as.character(lubridate::today()))
}, error = function(e) {
  message("Could not retrieve citations from Open Citations: ",
          conditionMessage(e))
  NULL
})
dim(citations)
## [1] 43222     8
## Combine freshly retrieved citations with the archived ones.
## `citations` is NULL if the retrieval above failed; in that case only the
## archive is used.
if (!is.null(citations)) {
  ## Keep only citation records (identified by their OCI) that are not
  ## already in the archive, to avoid querying Crossref for them again
  citations <- citations %>% 
    dplyr::filter(!(oci %in% citations_archive$oci))
  
  ## Look up metadata for the citing papers in Crossref and attach it.
  ## NOTE(review): if no new citations remain after the filter above, 
  ## cr_works() is called with an empty DOI vector - confirm that this is 
  ## handled as intended.
  tmpj <- rcrossref::cr_works(dois = unique(citations$citing))$data %>%
    dplyr::select(contains("doi"), contains("container.title"), contains("issn"),
                  contains("type"), contains("publisher"), contains("prefix"))
  citations <- citations %>% dplyr::left_join(tmpj, by = c("citing" = "doi"))
  
  ## bioRxiv preprints don't have a 'container.title' or 'issn', but we 
  ## assume that they can be identified by the DOI prefix 10.1101, and set 
  ## the container.title for these records manually. We may or may not want 
  ## to count these (a paper could be counted twice, once as a preprint and 
  ## once as a publication).
  citations$container.title[citations$prefix == "10.1101"] <- "bioRxiv"
  
  ## JOSS is represented by 'The Journal of Open Source Software' as well as 
  ## 'Journal of Open Source Software'; use the former for both
  citations$container.title[citations$container.title == 
                              "Journal of Open Source Software"] <- 
    "The Journal of Open Source Software"
  
  ## Remove real self citations (cited DOI = citing DOI)
  citations <- citations %>% dplyr::filter(cited != citing)
  
  ## Merge with the archive
  citations <- dplyr::bind_rows(citations, citations_archive)
} else {
  citations <- citations_archive
  ## Make sure the retrieval date column exists even if the archive 
  ## does not contain it
  if (is.null(citations[["citation_info_obtained"]])) {
    citations$citation_info_obtained <- NA_character_
  }
}

## Records without a retrieval date are assigned the fixed date 2021-08-11
citations$citation_info_obtained <- replace(
  citations$citation_info_obtained,
  is.na(citations$citation_info_obtained),
  "2021-08-11"
)

## Save the combined citation table as a tab-separated file
write.table(citations, file = "joss_submission_citations.tsv",
            sep = "\t", quote = FALSE,
            row.names = FALSE, col.names = TRUE)

Summary statistics

## Date of the most recent successful retrieval of new citation data
citations$citation_info_obtained %>% as.Date() %>% max()
## [1] "2023-12-06"
## Number of distinct JOSS papers that are cited in this collection
dplyr::n_distinct(citations$cited)
## [1] 1496
## Number of JOSS papers with >0 citations according to Crossref
sum(papers$citation_count > 0, na.rm = TRUE)
## [1] 1691
## Number of citations from Open Citations Corpus (n) vs Crossref
## (citation_count), one row per DOI found in either source.
## Papers without any record in the citation table get NA for n from the
## join. The replacement by 0 therefore has to happen after the join; applied
## to the tally itself it has no effect, since tally() never returns NA, and
## uncited papers would be dropped from the comparison plots instead of
## being shown with zero citations.
## DOIs that are cited but not among the papers keep NA as citation_count.
df0 <- papers %>% dplyr::select(doi, citation_count) %>%
  dplyr::full_join(citations %>% dplyr::group_by(cited) %>%
                     dplyr::tally(),
                   by = c("doi" = "cited")) %>%
  dplyr::mutate(n = replace(n, is.na(n), 0L))
## Total number of citations according to Crossref
with(df0, sum(citation_count, na.rm = TRUE))
## [1] 53933
## Total number of citations according to the Open Citations Corpus
with(df0, sum(n, na.rm = TRUE))
## [1] 72004
## Ratio of the two totals (Open Citations Corpus / Crossref)
with(df0, sum(n, na.rm = TRUE) / sum(citation_count, na.rm = TRUE))
## [1] 1.335064
## Open Citations Corpus citation count vs Crossref citation count per paper;
## the solid line marks equal counts
df0 %>%
  ggplot(aes(x = citation_count, y = n)) +
  geom_abline(intercept = 0, slope = 1) +
  geom_point(alpha = 0.5, size = 3) +
  theme_bw() +
  labs(x = "Crossref citation count",
       y = "Open Citations Corpus citation count",
       caption = dcap)

## Zoom in on papers with up to 75 citations
df0 %>%
  ggplot(aes(x = citation_count, y = n)) +
  geom_abline(intercept = 0, slope = 1) +
  geom_point(alpha = 0.5, size = 3) +
  coord_cartesian(xlim = c(0, 75), ylim = c(0, 75)) +
  theme_bw() +
  labs(x = "Crossref citation count",
       y = "Open Citations Corpus citation count",
       caption = dcap)

## Number of journals citing JOSS papers, counted by distinct title ...
dplyr::n_distinct(citations$container.title)
## [1] 8225
## ... and by distinct ISSN
dplyr::n_distinct(citations$issn)
## [1] 6149

Most citing journals

## Per citing journal: number of citations of JOSS papers, number of distinct
## JOSS papers cited, number of distinct citing papers, and number and
## fraction of citations flagged as author self-citations
topcit <- citations %>%
  dplyr::group_by(container.title) %>%
  dplyr::summarize(
    nbr_citations_of_joss_papers = dplyr::n(),
    nbr_cited_joss_papers = dplyr::n_distinct(cited),
    nbr_citing_papers = dplyr::n_distinct(citing),
    nbr_selfcitations_of_joss_papers = sum(author_sc == "yes"),
    fraction_selfcitations = signif(
      nbr_selfcitations_of_joss_papers / nbr_citations_of_joss_papers,
      digits = 3
    )
  ) %>%
  dplyr::arrange(desc(nbr_cited_joss_papers))
topcit %>%
  DT::datatable(
    escape = FALSE,
    rownames = FALSE,
    filter = list(position = "top", clear = FALSE),
    options = list(scrollX = TRUE)
  )
## Number of distinct JOSS papers cited vs total number of citations of JOSS
## papers, one point per citing journal; the dashed line marks equality
plotly::ggplotly(
  p = topcit %>%
    ggplot(aes(x = nbr_citations_of_joss_papers,
               y = nbr_cited_joss_papers,
               label = container.title)) +
    geom_abline(intercept = 0, slope = 1, color = "grey",
                linetype = "dashed") +
    geom_point(alpha = 0.5, size = 3) +
    theme_bw() +
    labs(x = "Number of citations of JOSS papers",
         y = "Number of cited JOSS papers",
         caption = dcap)
)
## Same plot, zoomed in on journals with few citations
plotly::ggplotly(
  p = topcit %>%
    ggplot(aes(x = nbr_citations_of_joss_papers,
               y = nbr_cited_joss_papers,
               label = container.title)) +
    geom_abline(intercept = 0, slope = 1, color = "grey",
                linetype = "dashed") +
    geom_point(alpha = 0.5, size = 3) +
    coord_cartesian(xlim = c(0, 100), ylim = c(0, 50)) +
    theme_bw() +
    labs(x = "Number of citations of JOSS papers",
         y = "Number of cited JOSS papers",
         caption = dcap)
)
## Save the per-journal summary as a tab-separated file
write.table(topcit, file = "joss_submission_citations_byjournal.tsv",
            sep = "\t", quote = FALSE,
            row.names = FALSE, col.names = TRUE)

Save object

The tibble object with all data collected above is serialized to a file that can be downloaded and reused.

## Show the first rows of the full table, as a plain data frame
papers %>% head() %>% as.data.frame()
##        alternative.id                 container.title    created  deposited
## 1 10.21105/joss.03453 Journal of Open Source Software 2021-08-06 2021-08-06
## 2 10.21105/joss.02583 Journal of Open Source Software 2020-09-26 2020-09-26
## 3 10.21105/joss.03362 Journal of Open Source Software 2021-08-22 2021-08-22
## 4 10.21105/joss.02608 Journal of Open Source Software 2020-11-05 2022-11-26
## 5 10.21105/joss.02013 Journal of Open Source Software 2020-02-10 2020-02-10
## 6 10.21105/joss.02181 Journal of Open Source Software 2020-07-16 2020-07-16
##   published.print                 doi    indexed      issn issue     issued
## 1      2021-08-06 10.21105/joss.03453 2022-03-29 2475-9066    64 2021-08-06
## 2      2020-09-26 10.21105/joss.02583 2022-03-28 2475-9066    53 2020-09-26
## 3      2021-08-22 10.21105/joss.03362 2022-03-29 2475-9066    64 2021-08-22
## 4      2020-11-05 10.21105/joss.02608 2022-11-27 2475-9066    55 2020-11-05
## 5      2020-02-10 10.21105/joss.02013 2023-05-19 2475-9066    46 2020-02-10
## 6      2020-07-16 10.21105/joss.02181 2023-06-16 2475-9066    51 2020-07-16
##   member page   prefix        publisher score   source reference.count
## 1   8722 3453 10.21105 The Open Journal     0 Crossref               8
## 2   8722 2583 10.21105 The Open Journal     0 Crossref              28
## 3   8722 3362 10.21105 The Open Journal     0 Crossref               7
## 4   8722 2608 10.21105 The Open Journal     0 Crossref              10
## 5   8722 2013 10.21105 The Open Journal     0 Crossref              12
## 6   8722 2181 10.21105 The Open Journal     0 Crossref              15
##   references.count is.referenced.by.count
## 1                8                      0
## 2               28                      0
## 3                7                      0
## 4               10                      0
## 5               12                      6
## 6               15                      5
##                                                                                                                                                     title
## 1                                                                                                               mcboost: Multi-Calibration Boosting for R
## 2                                                                 emba: R package for analysis and visualization of biomarkers in boolean model ensembles
## 3                                                                                               OSTIR: open source translation initiation rate prediction
## 4                                                                             AstroPaint: A Python Package for Painting Halo Catalogs into Celestial Maps
## 5 thresholdmodeling: A Python package for modeling excesses over a threshold using the Peak-Over-Threshold Method and the Generalized Pareto Distribution
## 6                                                                 ldaPrototype: A method in R to get a Prototype of multiple Latent Dirichlet Allocations
##              type                                   url volume
## 1 journal-article http://dx.doi.org/10.21105/joss.03453      6
## 2 journal-article http://dx.doi.org/10.21105/joss.02583      5
## 3 journal-article http://dx.doi.org/10.21105/joss.03362      6
## 4 journal-article http://dx.doi.org/10.21105/joss.02608      5
## 5 journal-article http://dx.doi.org/10.21105/joss.02013      5
## 6 journal-article http://dx.doi.org/10.21105/joss.02181      5
##   short.container.title
## 1                  JOSS
## 2                  JOSS
## 3                  JOSS
## 4                  JOSS
## 5                  JOSS
## 6                  JOSS
##                                                                                                                                                                                                                                                                                                                                                                  author
## 1 http://orcid.org/0000-0001-8867-762X, http://orcid.org/0000-0001-7363-4299, http://orcid.org/0000-0003-4324-4163, NA, NA, http://orcid.org/0000-0001-6002-6980, FALSE, FALSE, FALSE, NA, NA, FALSE, Florian, Christoph, Susanne, Matthew, Michael, Bernd, Pfisterer, Kern, Dandl, Sun, Kim, Bischl, first, additional, additional, additional, additional, additional
## 2                                                                                                                                                   http://orcid.org/0000-0002-3609-8674, http://orcid.org/0000-0002-1171-9876, http://orcid.org/0000-0002-3357-425X, FALSE, FALSE, FALSE, John, Martin, Åsmund, Zobolas, Kuiper, Flobak, first, additional, additional
## 3                                                                                                                                                                             http://orcid.org/0000-0001-6219-3168, NA, http://orcid.org/0000-0003-0888-7358, FALSE, NA, FALSE, Cameron, Alexandra, Jeffrey, Roots, Lukasiewicz, Barrick, first, additional, additional
## 4                          http://orcid.org/0000-0003-1978-6325, NA, http://orcid.org/0000-0002-4619-8927, NA, NA, NA, NA, FALSE, NA, FALSE, NA, NA, NA, NA, Siavash, Marcelo, Emmanuel, Karime, Shobeir, Nareg, Elena, Yasini, Alvarez, Schaan, Maamari, Mazinani, Mirzatuny, Pierpaoli, first, additional, additional, additional, additional, additional, additional
## 5                                                                                                                                                      http://orcid.org/0000-0002-5829-7711, http://orcid.org/0000-0003-0170-6083, http://orcid.org/0000-0002-8166-5666, FALSE, FALSE, FALSE, Iago, Antônio, Marcus, Lemos, Lima, Duarte, first, additional, additional
## 6                                                                                                                                                                                                                                                                                                     http://orcid.org/0000-0002-0007-4478, FALSE, Jonas, Rieger, first
##                                                                        subject
## 1                                                                         <NA>
## 2                                                                         <NA>
## 3                                                                         <NA>
## 4 Pulmonary and Respiratory Medicine,Pediatrics, Perinatology and Child Health
## 5                                                                         <NA>
## 6                                                                         <NA>
##   citation_count
## 1              0
## 2              0
## 3              0
## 4              0
## 5              6
## 6              5
##                                                                                                                                                 api_title
## 1                                                                                                               mcboost: Multi-Calibration Boosting for R
## 2                                                                 emba: R package for analysis and visualization of biomarkers in boolean model ensembles
## 3                                                                                               OSTIR: open source translation initiation rate prediction
## 4                                                                             AstroPaint: A Python Package for Painting Halo Catalogs into Celestial Maps
## 5 thresholdmodeling: A Python package for modeling excesses over a threshold using the Peak-Over-Threshold Method and the Generalized Pareto Distribution
## 6                                                                 ldaPrototype: A method in R to get a Prototype of multiple Latent Dirichlet Allocations
##   api_state         editor                   reviewers nbr_reviewers
## 1  accepted     @osorensen              @mwt,@OwenWard             2
## 2  accepted        @mikldk @neerajdhanraj,@edifice1989             2
## 3  accepted      @csoneson     @ayush9pandey,@standage             2
## 4  accepted      @harpolea            @AshKelly,@zpace             2
## 5  accepted @drvinceknight         @bahung,@kellieotto             2
## 6  accepted       @karthik       @tommyjones,@bstewart             2
##                                          repo_url review_issue_id
## 1              https://github.com/mlr-org/mcboost            3453
## 2                https://github.com/bblodfon/emba            2583
## 3             https://github.com/barricklab/ostir            3362
## 4           https://github.com/syasini/AstroPaint            2608
## 5 https://github.com/iagolemos1/thresholdmodeling            2013
## 6     https://github.com/JonasRieger/ldaPrototype            2181
##   prereview_issue_id               languages
## 1               3352                       R
## 2               2534                       R
## 3               3342                R,Python
## 4               2567 Python,Jupyter Notebook
## 5               1999                  Python
## 6               2156                       R
##                              archive_doi
## 1 https://doi.org/10.5281/zenodo.5156518
## 2 https://doi.org/10.5281/zenodo.4043085
## 3 https://doi.org/10.5281/zenodo.5227845
## 4 https://doi.org/10.5281/zenodo.4243176
## 5 https://doi.org/10.5281/zenodo.3661338
## 6 https://doi.org/10.5281/zenodo.3945836
##                                                                                                                                              review_title
## 1                                                                                                               mcboost: Multi-Calibration Boosting for R
## 2                                                                 emba: R package for analysis and visualization of biomarkers in boolean model ensembles
## 3                                                                                               OSTIR: open source translation initiation rate prediction
## 4                                                                             AstroPaint: A Python Package for Painting Halo Catalogs into Celestial Maps
## 5 thresholdmodeling: A Python package for modeling excesses over a threshold using the Peak-Over-Threshold Method and the Generalized Pareto Distribution
## 6                                                                 ldaPrototype: A method in R to get a Prototype of multiple Latent Dirichlet Allocations
##   review_number review_state review_opened review_closed review_ncomments
## 1          3453       closed    2021-07-04    2021-08-06               42
## 2          2583       closed    2020-08-19    2020-09-26               76
## 3          3362       closed    2021-06-11    2021-08-22               59
## 4          2608       closed    2020-08-28    2020-11-05               37
## 5          2013       closed    2020-01-13    2020-02-10               69
## 6          2181       closed    2020-05-03    2020-07-16               60
##                                      review_labels
## 1        accepted,TeX,R,recommend-accept,published
## 2        accepted,TeX,R,recommend-accept,published
## 3 accepted,TeX,Python,R,recommend-accept,published
## 4              accepted,recommend-accept,published
## 5              accepted,recommend-accept,published
## 6              accepted,recommend-accept,published
##                                                                                                                                              prerev_title
## 1                                                                                                               mcboost: Multi-Calibration Boosting for R
## 2                                                                 emba: R package for analysis and visualization of biomarkers in boolean model ensembles
## 3                                                                                               OSTIR: open source translation initiation rate prediction
## 4                                                                             AstroPaint: A Python Package for Painting Halo Catalogs into Celestial Maps
## 5 thresholdmodeling: A Python package for modeling excesses over a threshold using the Peak-Over-Threshold Method and the Generalized Pareto Distribution
## 6                                                                 ldaPrototype: A method in R to get a Prototype of multiple Latent Dirichlet Allocations
##   prerev_state prerev_opened prerev_closed prerev_ncomments prerev_labels
## 1       closed    2021-06-10    2021-07-04               34         TeX,R
## 2       closed    2020-07-28    2020-08-19               32         TeX,R
## 3       closed    2021-06-08    2021-06-11               24  TeX,Python,R
## 4       closed    2020-08-14    2020-08-28               32              
## 5       closed    2020-01-07    2020-01-13               37    TeX,Python
## 6       closed    2020-03-10    2020-05-03               31         TeX,R
##   days_in_pre days_in_rev to_review repo_created repo_updated repo_pushed
## 1     24 days     33 days      TRUE   2020-12-28   2024-01-12  2024-03-11
## 2     22 days     38 days      TRUE   2019-06-03   2023-04-26  2023-04-26
## 3      3 days     72 days      TRUE   2019-05-21   2024-02-16  2024-03-14
## 4     14 days     69 days      TRUE   2019-09-27   2024-01-13  2024-01-05
## 5      6 days     28 days      TRUE   2019-12-27   2024-03-20  2020-12-24
## 6     54 days     74 days      TRUE   2019-05-21   2023-01-31  2023-01-31
##   repo_nbr_stars    repo_language
## 1             27                R
## 2              0                R
## 3              9           Python
## 4             41 Jupyter Notebook
## 5             30           Python
## 6              7                R
##                              repo_languages_bytes
## 1                               R:125381,TeX:5117
## 2                              R:211165,TeX:45603
## 3                  Python:107327,R:10876,TeX:4642
## 4 Jupyter Notebook:6802687,Python:147492,TeX:5264
## 5                           Python:37812,TeX:3472
## 6                     R:149724,TeX:5269,Shell:157
##                                                                                                                                                                repo_topics
## 1                                    machine-learning,classification,fairness,fairness-ml,fairness-ai,responsible-ai,bias-correction,bias-detection,post-processing,ethics
## 2                                                                                                                                   r,r-package,biomarkers,ensemble-models
## 3                                                                                                                           synthetic-biology,bioengineering,rna-structure
## 4                                                                                                   cosmology,astrophysical-signals,halo-catalog,python,simulation-toolkit
## 5                                                                                                                                                                         
## 6 topicmodeling,topicmodelling,lda,topic-models,topic-model,topic-similarities,text-mining,textdata,latent-dirichlet-allocation,modelselection,model-selection,reliability
##   repo_license repo_nbr_contribs repo_nbr_contribs_2ormore repo_info_obtained
## 1        other                 7                         6         2024-03-27
## 2        other                 2                         1         2024-03-27
## 3      gpl-3.0                 3                         3         2024-03-27
## 4          mit                 6                         3         2024-03-27
## 5     lgpl-3.0                 4                         2         2024-03-27
## 6      gpl-3.0                 1                         1         2024-03-27
##   published.date halfyear nbr_authors
## 1     2021-08-06   2021H2           6
## 2     2020-09-26   2020H2           3
## 3     2021-08-22   2021H2           3
## 4     2020-11-05   2020H2           7
## 5     2020-02-10   2020H1           3
## 6     2020-07-16   2020H2           1
## Write the final summary table to an RDS file in the working directory,
## under the same file name as the archived copy read at the top of the report
saveRDS(object = papers, file = "joss_submission_analytics.rds")

To read the current version of this file directly from GitHub, use the following code:

## Download the published summary table from the gh-pages branch.
## The url connection is wrapped in gzcon() so that the compressed RDS stream
## can be decompressed while reading. A handle to the connection is kept so
## that it can be closed explicitly (also if reading fails) rather than being
## left for the garbage collector, which warns about unused connections.
con <- gzcon(url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_analytics.rds?raw=true"))
papers <- tryCatch(readRDS(con), finally = close(con))

Session info

## Print the R version, platform, locale and package versions that were
## in use when this report was generated
sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Monterey 12.7.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: UTC
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] readr_2.1.5       citecorp_0.3.0    plotly_4.10.4     DT_0.32          
##  [5] jsonlite_1.8.8    purrr_1.0.2       gh_1.4.1          lubridate_1.9.3  
##  [9] ggplot2_3.5.0     tidyr_1.3.1       dplyr_1.1.4       rcrossref_1.2.009
## [13] tibble_3.2.1     
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1   viridisLite_0.4.2  farver_2.1.1       urltools_1.7.3    
##  [5] viridis_0.6.5      fastmap_1.1.1      lazyeval_0.2.2     promises_1.2.1    
##  [9] digest_0.6.35      timechange_0.3.0   mime_0.12          lifecycle_1.0.4   
## [13] magrittr_2.0.3     compiler_4.3.3     rlang_1.1.3        sass_0.4.9        
## [17] tools_4.3.3        wordcloud_2.6      utf8_1.2.4         yaml_2.3.8        
## [21] data.table_1.15.4  knitr_1.45         fauxpas_0.5.2      labeling_0.4.3    
## [25] htmlwidgets_1.6.4  bit_4.0.5          curl_5.2.1         plyr_1.8.9        
## [29] xml2_1.3.6         RColorBrewer_1.1-3 httpcode_0.3.0     miniUI_0.1.1.1    
## [33] withr_3.0.0        triebeard_0.4.1    grid_4.3.3         fansi_1.0.6       
## [37] xtable_1.8-4       colorspace_2.1-0   gitcreds_0.1.2     scales_1.3.0      
## [41] crul_1.4.0         cli_3.6.2          rmarkdown_2.26     crayon_1.5.2      
## [45] generics_0.1.3     httr_1.4.7         tzdb_0.4.0         cachem_1.0.8      
## [49] stringr_1.5.1      splines_4.3.3      parallel_4.3.3     vctrs_0.6.5       
## [53] Matrix_1.6-5       hms_1.1.3          bit64_4.0.5        crosstalk_1.2.1   
## [57] jquerylib_0.1.4    glue_1.7.0         stringi_1.8.3      gtable_0.3.4      
## [61] later_1.3.2        munsell_0.5.1      pillar_1.9.0       rappdirs_0.3.3    
## [65] htmltools_0.5.8    R6_2.5.1           httr2_1.0.1        vroom_1.6.5       
## [69] evaluate_0.23      shiny_1.8.1        lattice_0.22-5     highr_0.10        
## [73] httpuv_1.6.15      bslib_0.7.0        Rcpp_1.0.12        gridExtra_2.3     
## [77] nlme_3.1-164       mgcv_1.9-1         whisker_0.4.1      xfun_0.43         
## [81] pkgconfig_2.0.3